manhattan_rides_df <- read_csv("manhattan_rides.csv")
manhattan_rides_df <-
manhattan_rides_df %>%
mutate(
day_of_week = factor(day_of_week, ordered = T,
levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")),
year = factor(year),
age_group = factor(age_group, ordered = T,
levels = c("18-25","26-35", "36-45", "46-55", "56-65", "66-85")),
gender = type.convert(gender, as.is = F))
manhattan_rides_df %>%
group_by(start_date, year) %>%
summarize(obs = n()) %>%
ggplot(aes(x = start_date, y = obs, group = year, color = year)) +
geom_line() +
geom_smooth(se = FALSE) +
labs(title = "Number of CitiBike Rides in 2019 and 2020") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, color = rep(c("black", rep("transparent", each = 10)), 365))) +
theme(plot.title = element_text(hjust = 0.5))

Overall and as expected, there were more CitiBike rides in 2019 beginning in the peak months of the pandemic in 2020.
monthly_Q1 <- quantile(pull(manhattan_rides_df, trip_min), probs = 0.25)
monthly_Q3 <- quantile(pull(manhattan_rides_df, trip_min), probs = 0.75)
monthly_inter_quart <- IQR(pull(manhattan_rides_df, trip_min))
monthly_rides_df <-
manhattan_rides_df %>%
filter(
trip_min >= monthly_Q1 - 1.5*monthly_inter_quart,
trip_min <= monthly_Q3 + 1.5*monthly_inter_quart
)
monthly_rides_df %>%
group_by(year) %>%
mutate(
month = month(starttime, label = T)
) %>%
plot_ly(
x = ~month,
y = ~trip_min,
color = ~year,
type = "box",
colors = "viridis") %>%
layout(
boxmode = "group",
title = "Duration of Citibike Rides by Month",
xaxis = list(title = "Month"),
yaxis = list(title = "Trip Duration in Minutes")
)
Looks like maybe the overall length of trips in 2019 was more consistent. 2020 had a bump in duration of rides, starting in April. Overall, trip length seems more variable in 2020.
manhattan_rides_df %>%
group_by(year) %>%
mutate(
month = month(starttime, label = T)
) %>%
group_by(year, month) %>%
summarise(obs = n()) %>%
plot_ly(
x = ~month,
y = ~obs,
color = ~year,
type = "scatter",
mode = "lines",
colors = "viridis") %>%
layout(
title = "Number of Citibike Rides per Month",
xaxis = list(title = "Month"),
yaxis = list(title = "Rides")
)
Huge drop in monthly trips in April 2020. Lockdown started mid/late March so this coincides with people transitioning to WFH and largely staying inside to minimize contacts. The ride numbers bounce back quite a bit after this but not to 2019 levels.
# the code below saves trip_min, gender, age_group variables of manhattan_rides_df
trip_dur_age_gender_df <-
manhattan_rides_df %>%
select(trip_min, gender, age_group)
# looking at the data we can tell there are massive outliers in trip duration
# so i am going to filter them out using the IQR method
Q1 <- quantile(pull(trip_dur_age_gender_df, trip_min), probs = 0.25)
Q3 <- quantile(pull(trip_dur_age_gender_df, trip_min), probs = 0.75)
inter_quart <- IQR(pull(trip_dur_age_gender_df, trip_min))
trip_dur_age_gender_df <-
trip_dur_age_gender_df %>%
filter(
trip_min >= Q1 - 1.5*inter_quart,
trip_min <= Q3 + 1.5*inter_quart
)
#this generates a plot of boxplots of trip duration (minutes) by gender and age_group
trip_dur_age_gender_df %>%
mutate(
gender = str_to_sentence(gender)
) %>%
plot_ly(x = ~age_group, y = ~trip_min, color = ~gender, type = "box", colors = "viridis") %>%
layout(
boxmode = "group",
xaxis = list(title = "Age Range"),
yaxis = list(title = "Trip Duration (min)"),
title = "Average Trip Duration (in minutes) in 2019 and 2020 by Age Group and Gender",
legend = list(title = list(text = "<b> Gender </b>"))
)
We can see by comparing averages, females take longer rides than males across all age groups. Interestingly though there are more outliers for trip duration among males than females. On average younger riders take longer rides compared to older riders. Rides never surpassed 35 minutes. This makes sense because Citi Bike members get charged extra fees for trips over 40 mins.
# this dataframe groups rides by gender and month
# and provides the average age for each gender in that month
# along with the standard deviation, standard error, lower bound, and upper bound
# which are then used to create a plotly graph where we get average age per month for each gender
# with 95% confidence bands around each line
avg_age_per_month_df <-
read_csv("manhattan_rides.csv") %>%
mutate(
date = floor_date(as_date(starttime), "month")
) %>%
select(date, gender, age) %>%
group_by(date, gender) %>%
summarize(
total = n(),
avg_age = mean(age),
sd_age = sd(age)
) %>%
mutate(
sem = sd_age/sqrt(total - 1),
lower_bound = avg_age + qt(0.025, df = total - 1) * sem,
upper_bound = avg_age - qt(0.025, df = total - 1) * sem
) %>%
ungroup()
avg_age_plot <-
avg_age_per_month_df %>%
mutate(
gender = str_to_sentence(gender)
) %>%
ggplot(aes(x = date, y = avg_age, color = gender)) +
geom_line(size = 1, alpha = 0.8) +
geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound), alpha = 0.2)
ggplotly(avg_age_plot) %>%
layout(
xaxis = list(title = "Date"),
yaxis = list(title = "Age"),
title = list(text = "Average Age of CitiBike Riders in 2019 and 2020 by Gender", font = list(size = 16)),
legend = list(title = list(text = "<b> Gender </b>"))
)
We see from this graph that across 2019 and 2020, the average age of male and female riders follow the same general trajectory. At all time points over this period, the average age of female riders is younger than the average age of male riders. The biggest age discrepancies appear in the summer months of both 2019 and 2020.